{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "variable-metallic",
   "metadata": {
    "papermill": {
     "duration": 0.005946,
     "end_time": "2021-03-12T08:31:08.797414",
     "exception": false,
     "start_time": "2021-03-12T08:31:08.791468",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Pulls image data from covid-chestxray-dataset\n",
    "Pulls image data from https://github.com/ieee8023/covid-chestxray-dataset.git and outputs a folder containing the images (one subfolder per finding) and a CSV file containing the image metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "current-correction",
   "metadata": {
    "papermill": {
     "duration": 3.904143,
     "end_time": "2021-03-12T08:31:12.707529",
     "exception": false,
     "start_time": "2021-03-12T08:31:08.803386",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Image coordinates read by the docker commands in the next cell.\n",
    "os.environ.update({'repository': 'romeokienzler', 'version': '0.2'})\n",
    "\n",
    "# Optional switches evaluated by the next cell (leave commented out for a plain run):\n",
    "#os.environ['create_image']='True'\n",
    "#os.environ['install_requirements']='True'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "876662fa-c233-4464-b5fc-74e08d8778fe",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Environment variables are always strings, so bool('False') is True;\n",
    "# compare against explicit truthy spellings instead.\n",
    "_truthy = ('1', 'true', 'yes')\n",
    "\n",
    "if os.environ.get('create_image', 'False').strip().lower() in _truthy:\n",
    "    # Write a Dockerfile that packages this notebook, then build, tag and push the image.\n",
    "    # NOTE(review): in an exec-form ENTRYPOINT the \"> /tmp/component.log\" entries are passed\n",
    "    # to ipython as literal arguments; they are not shell redirections.\n",
    "    docker_file=\"\"\"\n",
    "    FROM registry.access.redhat.com/ubi8/python-39\n",
    "    RUN pip install ipython nbformat gitpython~=3.1 pandas==1.2.1\n",
    "    RUN mkdir component-library\n",
    "    RUN mkdir component-library/input\n",
    "    ADD input-covid-chestxray.ipynb /component-library/input/\n",
    "    ENTRYPOINT [\"ipython\",\"/component-library/input/input-covid-chestxray.ipynb\",\"> /tmp/component.log\",\"2> /tmp/component.err\"]\n",
    "    \"\"\"\n",
    "    with open(\"Dockerfile\", \"w\") as text_file:\n",
    "        text_file.write(docker_file)\n",
    "\n",
    "    !docker build -t claimed-input-covid-chestxray:`echo $version` .\n",
    "    !docker tag claimed-input-covid-chestxray:`echo $version` `echo $repository`/claimed-input-covid-chestxray:`echo $version`\n",
    "    !docker push `echo $repository`/claimed-input-covid-chestxray:`echo $version`\n",
    "elif os.environ.get('install_requirements', 'False').strip().lower() in _truthy:\n",
    "    # The former command had a stray second 'install', which pip treated as a package name.\n",
    "    !pip install gitpython~=3.1 pandas==1.2.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "sought-notebook",
   "metadata": {
    "papermill": {
     "duration": 0.013733,
     "end_time": "2021-03-12T08:31:18.369714",
     "exception": false,
     "start_time": "2021-03-12T08:31:18.355981",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "import git\n",
    "import os\n",
    "import shutil\n",
    "import logging\n",
    "import pandas as pd\n",
    "from shutil import copyfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "alive-compound",
   "metadata": {
    "papermill": {
     "duration": 0.014288,
     "end_time": "2021-03-12T08:31:18.391435",
     "exception": false,
     "start_time": "2021-03-12T08:31:18.377147",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
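    "# @param image_foldername name of the folder the images are copied into\n",
    "# @param metadata_filename name of the CSV file the filtered metadata is written to\n",
    "# @param data_dir location of the image folder and the metadata file\n",
    "# @param skip_if_exists exit early when the image folder already exists\n",
    "# @returns image folder with one subfolder per finding\n",
    "# @returns metadata CSV file"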
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "heavy-seller",
   "metadata": {
    "papermill": {
     "duration": 0.01951,
     "end_time": "2021-03-12T08:31:18.419049",
     "exception": false,
     "start_time": "2021-03-12T08:31:18.399539",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Component parameters, all read from environment variables with the defaults shown.\n",
    "image_foldername = os.environ.get('image_foldername', 'covid-chestxray-images')\n",
    "metadata_filename = os.environ.get('metadata_filename', 'metadata.csv')\n",
    "data_dir = os.environ.get('data_dir', '.')\n",
    "# bool() of any non-empty string (including the default 'False') is True,\n",
    "# so the flag text is parsed explicitly.\n",
    "skip_if_exists = os.environ.get('skip_if_exists', 'False').strip().lower() in ('1', 'true', 'yes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a9743c4-10bc-4dff-9dfb-aa1820290148",
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.path.join inserts the separator that plain concatenation drops\n",
    "# (data_dir='.' used to yield '.covid-chestxray-images').\n",
    "output_folder = os.path.join(data_dir, image_foldername)\n",
    "\n",
    "# Nothing to do when the images were already extracted and skipping is requested.\n",
    "if skip_if_exists and os.path.exists(output_folder):\n",
    "    sys.exit(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bcd0795-18e5-4e9e-aff6-d364f509059e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#data_dir = '../../data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a27199d-2fd2-4f8f-8802-d7ba88b44a4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the checked-out files are used (metadata.csv and images/), so a shallow clone is enough.\n",
    "# Skip the clone when a checkout is already present, e.g. left over from an interrupted run.\n",
    "if not os.path.isdir('covid-chestxray-dataset'):\n",
    "    !git clone --depth 1 https://github.com/ieee8023/covid-chestxray-dataset.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c0527eb-3106-4eb7-8b2b-e65ec543c1a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the metadata table from the root of the cloned dataset repository.\n",
    "metadata_source = 'covid-chestxray-dataset/metadata.csv'\n",
    "metadata = pd.read_csv(metadata_source)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3d525d0-c6e1-4de3-9d6b-c76688a1940f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Findings become folder names, so '/' is replaced; .str.replace tolerates missing values\n",
    "# (the former lambda raised on NaN).\n",
    "metadata['finding'] = metadata['finding'].str.replace('/', '_', regex=False)\n",
    "\n",
    "# Literal (non-regex) matching: as a regex, '.gz' would match any character followed by 'gz'.\n",
    "# Rows with a missing finding or filename are dropped as well (na=True), since they can\n",
    "# neither be assigned to a class folder nor copied.\n",
    "unusable = (\n",
    "    metadata['finding'].str.contains('todo', regex=False, na=True)\n",
    "    | metadata['finding'].str.contains('Unknown', regex=False, na=True)\n",
    "    | metadata['filename'].str.contains('.gz', regex=False, na=True)\n",
    ")\n",
    "metadata = metadata[~unusable]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c2e60a3-7fa6-492b-960c-acea26854635",
   "metadata": {},
   "outputs": [],
   "source": [
    "# makedirs also creates missing parent directories and does not fail when the folder is already there.\n",
    "os.makedirs(output_folder, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b77bc67-1945-4f9e-9080-8ce044bf116f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One entry per distinct finding, in order of first appearance.\n",
    "folders = metadata['finding'].drop_duplicates().to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a6663ff-664d-4aeb-8b8f-742950ad83e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create one subfolder per finding; exist_ok keeps a re-run from failing on folders that already exist.\n",
    "for folder in folders:\n",
    "    os.makedirs(os.path.join(output_folder, folder), exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1dfc7dc-c6b7-4ae2-ab7b-fc57ce7312be",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy every remaining image into the subfolder named after its finding.\n",
    "for file_name, class_name in zip(metadata['filename'], metadata['finding']):\n",
    "    source_path = 'covid-chestxray-dataset/images' + '/' + file_name\n",
    "    target_path = output_folder + '/' + class_name + '/' + file_name\n",
    "    copyfile(source_path, target_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39eb9c23-508e-4eb8-be50-01c2c3d4479b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.path.join inserts the separator that plain concatenation drops\n",
    "# (data_dir='.' used to write to '.metadata.csv').\n",
    "metadata.to_csv(os.path.join(data_dir, metadata_filename), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "398fc894-894b-414d-99e3-628a6ad3cd60",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the temporary clone without depending on a POSIX shell;\n",
    "# ignore_errors mirrors 'rm -f', which stays silent when the directory is missing.\n",
    "shutil.rmtree('covid-chestxray-dataset', ignore_errors=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 154.360301,
   "end_time": "2021-03-12T08:33:42.317507",
   "environment_variables": {},
   "exception": null,
   "input_path": "/home/jovyan/work/examples/pipelines/claimed_covid_ct_trusted_ai/component-library/input/input-covid-chestxray.ipynb",
   "output_path": "/home/jovyan/work/examples/pipelines/claimed_covid_ct_trusted_ai/component-library/input/input-covid-chestxray.ipynb",
   "parameters": {},
   "start_time": "2021-03-12T08:31:07.957206",
   "version": "2.2.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
